


#****************************************************
#RUN THIS FIRST! THEN RUN THE FILE WITH "FINAL PAPER" IN TITLE

#THIS CODE CLEANS THE RAW DATA PER THE ORIGINAL AUTHOR'S STATA PROGRAM
#****************************************************






library(ipumsr)
library(dplyr)

# Read the IPUMS NHIS extract together with its documentation.
# The read calls follow the loading snippet distributed with the extract.
# Make sure the data (.gz) and the DDI (.xml) are stored in the working
# directory.

# Fail early with an actionable message when ipumsr is not installed.
# requireNamespace() only checks availability (it does not attach the
# package), so the read calls below are namespace-qualified.
if (!requireNamespace("ipumsr", quietly = TRUE)) {
  stop("Reading IPUMS data into R requires the ipumsr package. It can be installed using the following command: install.packages('ipumsr')")
}

# Placeholder: replace with the folder that holds the extract before running.
setwd("YOUR WORKING DIRECTORY")

# The DDI file tells R how to read the raw data; the raw data file must have
# the same file prefix as the DDI.
ddi <- ipumsr::read_ipums_ddi("nhis_00002.xml")
data <- ipumsr::read_ipums_micro(ddi)


#The code below recodes every variable one statement at a time,
#mirroring the original authors' Stata code.
#dplyr has recoding functions that would be more elegant,
#but the line-by-line form helped ensure we followed the Stata code.
# Keep rows with YEAR above 2007 and AGE below 51.
# which() drops rows where the condition is NA instead of returning NA rows.
# (A previous YEAR-only subset was removed: it was overwritten immediately by
# this statement, which subsets `data` directly, so it never had any effect.)
newdata <- data[which(data$YEAR > 2007 & data$AGE < 51), ]

# Collapse HISPETH in place to a 0/1 flag: code 10 -> 0, codes of 20 or
# more -> 1, anything else -> NA.
newdata$HISPETH <- with(
  newdata,
  ifelse(HISPETH >= 20, 1, ifelse(HISPETH == 10, 0, NA))
)

# race2 is derived from RACEA:
#   100 -> 0, 200 -> 1, 410 up to (but excluding) 499 -> 3,
#   310 and codes of 500 or more -> NA, every other code is kept as is.
# Rows flagged by the recoded HISPETH (== 1) are then set to 2 regardless of
# RACEA. Because ifelse() propagates NA, a missing HISPETH also yields NA.
newdata$race2 <- local({
  race_code <- newdata$RACEA
  race_group <- ifelse(
    race_code == 100, 0,
    ifelse(
      race_code == 200, 1,
      ifelse(
        race_code == 310 | race_code >= 500, NA,
        ifelse(race_code >= 410 & race_code < 499, 3, race_code)
      )
    )
  )
  ifelse(newdata$HISPETH == 1, 2, race_group)
})

# married is derived from MARST: code 0 -> NA, codes 11-12 -> 1,
# codes of 13 or more -> 0; any other code is carried over unchanged.
newdata$married <- with(
  newdata,
  ifelse(
    MARST == 0, NA,
    ifelse(
      MARST >= 13, 0,
      ifelse(MARST >= 11 & MARST <= 12, 1, MARST)
    )
  )
)


# EDUCREC is EDUCREC1 with code 0 and codes of 90 or more set to NA.
newdata$EDUCREC <- with(
  newdata,
  ifelse(EDUCREC1 == 0 | EDUCREC1 >= 90, NA, EDUCREC1)
)

# yrschl is the cleaned education code shifted down by one.
newdata$yrschl <- newdata$EDUCREC - 1

# Attainment dummies: 1 at or above the cutoff, 0 below it, NA when the
# cleaned education code is missing.
newdata$highschool <- as.numeric(newdata$EDUCREC >= 12)
newdata$somecollege <- as.numeric(newdata$EDUCREC >= 13)
newdata$college <- as.numeric(newdata$EDUCREC >= 15)

# inccat maps INCFAM97ON2 codes onto 1-5:
#   10 -> 1, 20 -> 2, 31 -> 3, 32 -> 4, codes of 98 or more -> 5.
# Any other code is carried over unchanged.
newdata$inccat <- with(
  newdata,
  ifelse(
    INCFAM97ON2 >= 98, 5,
    ifelse(
      INCFAM97ON2 == 32, 4,
      ifelse(
        INCFAM97ON2 == 31, 3,
        ifelse(
          INCFAM97ON2 == 20, 2,
          ifelse(INCFAM97ON2 == 10, 1, INCFAM97ON2)
        )
      )
    )
  )
)


# insurance is derived from HINOTCOVE: code 2 -> 1, codes 1 and 3 -> 0;
# any other code is carried over unchanged.
newdata$insurance <- with(
  newdata,
  ifelse(
    HINOTCOVE == 2, 1,
    ifelse(HINOTCOVE == 1 | HINOTCOVE == 3, 0, HINOTCOVE)
  )
)


#Note that the authors changed their code after the paper was published
#to get rid of one of their K6 index variables that should
#not have been included ("AFEELINT1MO"). I removed it here
#to match their Stata code, but the paper version leaves it in.

#NOTE: I found a new error in their K6 index -- they sum it for people with
#at least one non-missing value on the 6 component variables, but it should
#only be computed when all 6 are non-missing.

# The six component columns of the K6 index.
K6 <- c(
  "AEFFORT", "AHOPELESS", "ANERVOUS",
  "ARESTLESS", "ASAD", "AWORTHLESS"
)
# newdata$flag_index<-ifelse(newdata$AEFFORT>=6, 1, 0)

# In every component, codes of 6 or more become NA.
newdata[K6] <- lapply(
  newdata[K6],
  function(item) ifelse(item >= 6, NA, item)
)

# Row-wise sum over the components, skipping missing ones. A total of 0 is
# turned into NA only when AEFFORT is missing; otherwise the total stands
# even if some components are missing.
newdata$k6_index <- local({
  k6_total <- rowSums(newdata[K6], na.rm = TRUE)
  ifelse(k6_total == 0 & is.na(newdata$AEFFORT), NA, k6_total)
})

# mood_dist: 1 when the index is 5 or more, 0 when it lies in [0, 4],
# NA otherwise (including a missing index).
newdata$mood_dist <- with(
  newdata,
  ifelse(k6_index >= 5, 1, ifelse(k6_index >= 0 & k6_index <= 4, 0, NA))
)

# health reverses the HEALTH codes 1-5 (1 <-> 5, 2 <-> 4, 3 stays 3) and sets
# codes of 7 or more to NA. Any other code is carried over unchanged.
newdata$health <- with(
  newdata,
  ifelse(
    HEALTH >= 7, NA,
    ifelse(
      HEALTH == 1, 5,
      ifelse(
        HEALTH == 2, 4,
        ifelse(
          HEALTH == 4, 2,
          ifelse(HEALTH == 5, 1, HEALTH)
        )
      )
    )
  )
)

# poor_health: 1 when the reversed score lies in [0, 2], 0 when it is above 2,
# NA otherwise (including a missing score).
newdata$poor_health <- with(
  newdata,
  ifelse(health > 2, 0, ifelse(health >= 0 & health <= 2, 1, NA))
)

# foreign_born is derived from REGIONBR: code 1 -> 0, codes 2-11 -> 1,
# code 99 -> NA; any other code is carried over unchanged.
newdata$foreign_born <- with(
  newdata,
  ifelse(
    REGIONBR == 99, NA,
    ifelse(
      REGIONBR == 1, 0,
      ifelse(REGIONBR >= 2 & REGIONBR <= 11, 1, REGIONBR)
    )
  )
)

# Placeholder so that imm16 keeps its column position ahead of temp.
newdata$imm16 <- NA

# temp replaces the YRSINUS category codes with numeric values:
#   0 -> NA, 1 -> 1, 2 -> 3, 3 -> 7.5, 4 -> 12.5, 5 -> 22.
# Any other code is carried over unchanged.
newdata$temp <- with(
  newdata,
  ifelse(
    YRSINUS == 0, NA,
    ifelse(
      YRSINUS == 1, 1,
      ifelse(
        YRSINUS == 2, 3,
        ifelse(
          YRSINUS == 3, 7.5,
          ifelse(
            YRSINUS == 4, 12.5,
            ifelse(YRSINUS == 5, 22, YRSINUS)
          )
        )
      )
    )
  )
)

# imm16: 1 when AGE - temp is at most 16, 0 when it is above 16.
# NOTE(review): the foreign_born == 0 step below never reaches the final
# value. When AGE - temp is non-missing the two later steps overwrite it, and
# when AGE - temp is missing ifelse() returns NA for the row. Rows with a
# missing temp therefore end up NA even if foreign_born == 0. Confirm against
# the Stata program whether that is the intended result.
newdata$imm16 <- local({
  age_minus_temp <- newdata$AGE - newdata$temp
  flag <- ifelse(newdata$foreign_born == 0, 1, NA)
  flag <- ifelse(age_minus_temp <= 16, 1, flag)
  ifelse(age_minus_temp > 16, 0, flag)
})

# non_citizen is derived from CITIZEN: code 2 and codes 7-9 -> 0;
# any other code is carried over unchanged.
newdata$non_citizen <- with(
  newdata,
  ifelse(CITIZEN == 2 | (CITIZEN >= 7 & CITIZEN <= 9), 0, CITIZEN)
)

# Print the distribution of the recoded variable as a quick check.
table(newdata$non_citizen)

# age_imm is AGE minus temp; values of -1 or lower are set to 0
# (values strictly between -1 and 0 are left as they are).
newdata$age_imm <- local({
  age_less_temp <- newdata$AGE - newdata$temp
  ifelse(age_less_temp <= -1, 0, age_less_temp)
})

# Interview date as a fractional year: YEAR plus (INTERVWMO - 1) / 12.
newdata$yr_mo <- with(newdata, YEAR + (INTERVWMO - 1) / 12)

# post: 1 when yr_mo is after 2012.5, 0 when at or before it, NA when missing.
newdata$post <- as.numeric(newdata$yr_mo > 2012.5)

# birthmo / birthyr: copies of BIRTHMO / BIRTHYR with codes of 13 or more
# (month) and 2016 or more (year) set to NA.
newdata$birthmo <- with(newdata, ifelse(BIRTHMO >= 13, NA, BIRTHMO))
newdata$birthyr <- with(newdata, ifelse(BIRTHYR >= 2016, NA, BIRTHYR))

# age_pol: 2012 minus (YEAR - AGE).
newdata$age_pol <- with(newdata, 2012 - (YEAR - AGE))

# elig1: 1 when imm16 == 1, non_citizen == 1 and age_pol is at most 31;
# 0 when that condition is FALSE; NA when it evaluates to NA.
newdata$elig1 <- with(
  newdata,
  ifelse(imm16 == 1 & non_citizen == 1 & age_pol <= 31, 1, 0)
)

# Interaction of the post and elig1 indicators.
newdata$post_elig1 <- with(newdata, post * elig1)

# Overwrite SEX with a 0/1 flag: 1 for code 2, 0 for any other code.
newdata$SEX <- as.numeric(newdata$SEX == 2)


# Final dataset with all of the original authors' exclusions applied.
# A row is kept when AGE is above 18, the recoded HISPETH and non_citizen both
# equal 1, EDUCREC is 13 or more or missing, and YRSINUS is above 2 or missing.
# which() drops rows whose combined condition is NA.
sampledata_orig <- newdata[
  with(
    newdata,
    which(
      AGE > 18 & HISPETH == 1 & non_citizen == 1 &
        (is.na(EDUCREC) | EDUCREC >= 13) &
        (is.na(YRSINUS) | YRSINUS > 2)
    )
  ),
]

# Print the distribution of the eligibility flag in the final sample.
summary(sampledata_orig$elig1)

